Data Analysis and Data Analytics¶

In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from PIL import Image, ImageEnhance
import matplotlib.cm as cm
from PIL import Image, ImageEnhance
In [98]:
match_df = pd.read_csv(r"C:\Users\Justin\Downloads\final_agg_match_stats.csv")
match_df
Out[98]:
date game_size match_id match_mode party_size player_assists player_dbno player_dist_ride player_dist_walk player_dmg player_kills player_name player_survive_time team_id team_placement
0 2017-11-26 37 2U4GBNA0YmnNZYkzjkfgN4ev-hXSrak_BSey_YEG6kIuDG... tpp 2 0 1 2870.72400 1784.847780 117 1 SnuffIes 18.438667 4 18
1 2017-11-26 37 2U4GBNA0YmnNZYkzjkfgN4ev-hXSrak_BSey_YEG6kIuDG... tpp 2 0 1 2938.40723 1756.079710 127 1 Ozon3r 18.438583 4 18
2 2017-11-26 37 2U4GBNA0YmnNZYkzjkfgN4ev-hXSrak_BSey_YEG6kIuDG... tpp 2 0 0 0.00000 224.157562 67 0 bovize 3.925967 5 33
3 2017-11-26 37 2U4GBNA0YmnNZYkzjkfgN4ev-hXSrak_BSey_YEG6kIuDG... tpp 2 0 0 0.00000 92.935150 0 0 sbahn87 3.292550 5 33
4 2017-11-26 37 2U4GBNA0YmnNZYkzjkfgN4ev-hXSrak_BSey_YEG6kIuDG... tpp 2 0 0 2619.07739 2510.447000 175 2 GeminiZZZ 25.624917 14 11
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9995 2017-11-20 29 2U4GBNA0Ymlm4JHKbbmQ9x9_rZbdELcOlVMjnXNbpV6MTm... tpp 4 0 2 5947.79000 1629.568000 87 2 2thikk4u 22.191600 6 7
9996 2017-11-20 29 2U4GBNA0Ymlm4JHKbbmQ9x9_rZbdELcOlVMjnXNbpV6MTm... tpp 4 0 0 4421.93262 3212.797850 137 2 peckerpecker 19.357767 6 7
9997 2017-11-20 29 2U4GBNA0Ymlm4JHKbbmQ9x9_rZbdELcOlVMjnXNbpV6MTm... tpp 4 0 2 0.00000 873.056300 193 0 Bawngfist 6.027517 6 7
9998 2017-11-20 29 2U4GBNA0Ymlm4JHKbbmQ9x9_rZbdELcOlVMjnXNbpV6MTm... tpp 4 0 1 0.00000 744.918152 87 1 Coots_McGoots 8.293617 7 14
9999 2017-11-20 29 2U4GBNA0Ymlm4JHKbbmQ9x9_rZbdELcOlVMjnXNbpV6MTm... tpp 4 1 0 0.00000 670.910339 12 0 sbwwt 8.252450 7 14

10000 rows × 15 columns

In [99]:
kill_df = pd.read_csv(r"C:\Users\Justin\Downloads\final_kill_match_stats.csv")
kill_df
Out[99]:
killed_by killer_name killer_placement killer_position_x killer_position_y map match_id time victim_name victim_placement victim_position_x victim_position_y
0 Down and Out Malcolm_x 9.0 496989.8 312569.7 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 17.250000 Player 180 22.0 497385.4 331528.2
1 M16A4 Malcolm_x 9.0 496989.8 312569.7 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 17.250000 Player 181 22.0 497819.4 331981.3
2 AKM G_Berg 7.0 460416.7 414748.8 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 23.700000 Player 182 16.0 459817.9 414426.3
3 AKM Lukesnake17 20.0 488034.1 347220.3 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 20.166667 Player 183 9.0 487444.2 347651.0
4 SKS AlooGobi 2.0 501062.9 425078.6 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 30.300000 Player 184 3.0 493043.4 434458.1
... ... ... ... ... ... ... ... ... ... ... ... ...
8229 SKS dragonfruitbamf 2.0 536215.7 365051.5 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 27.366667 Player 9996 6.0 527811.1 375679.6
8230 M416 lddoos 3.0 550700.4 364211.8 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 27.200000 Player 9997 6.0 522475.8 375727.8
8231 Down and Out Nan_P 3.0 507691.3 388946.0 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 26.800000 Player 9998 6.0 513487.8 377484.4
8232 Grenade brentech 7.0 503235.2 351960.6 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 25.866667 Player 9999 9.0 507682.2 354356.8
8233 SCAR-L SwwH 13.0 447508.2 625632.6 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 8.683333 Player 10000 20.0 447384.1 625920.0

8234 rows × 12 columns

1. What party size should we choose?¶

The most winning party size in PUBG

In [100]:
# filter and select the rows with 'team_placement' = 1

win_df = match_df.drop(match_df[match_df['team_placement']!=1].index)

# filter and select the rows with 'party_size' = 1,2,4 and store into different dataframes

solo_df = win_df.drop(win_df[win_df['party_size']!=1].index)
double_df = win_df.drop(win_df[win_df['party_size']!=2].index)
quad_df = win_df.drop(win_df[win_df['party_size']!=4].index)

# count the number of rows in each dataframe for furthur analysis 

solo_win = int(solo_df['team_placement'].value_counts())
double_win = int(double_df['team_placement'].value_counts())
quadruple_win = int(quad_df['team_placement'].value_counts())
In [101]:
# set the colours for pie chart

colorss = plt.cm.coolwarm(np.linspace(0.4,8))

# store and set the category(labels), and values(count) in each dataframe into a list

categories = ['Solo','Duo','Squad']
values = [solo_win, double_win, quadruple_win]

# separate the pie chart into sectors

explode = (0.05,0.05,0.05)
plt.figure(figsize=(7, 7))

# plot the pie chart, with shadow attribute

plt.pie(values, explode=explode, colors=colorss, shadow = True, autopct='%.4f%%')

# set a title for pie chart

plt.title('Probability to win (party size)')

# set a category table each categories

plt.legend(categories, loc = 'upper right') 

# enhance the quality of the output

plt.tight_layout()

# display our plot

plt.show()

2. Where is the safest place to land?¶

Visualisation of dangerous locations in PUBG Erangel map

In [102]:
# Data normalisation: Rescale the data in the "position" columns from their original values to a different scale.
# This is to bring the values into a more smaller range for further analysis or visualization.

kill2_df = kill_df.copy()
position_data = ["victim_position_x", "victim_position_y","killer_position_x", "killer_position_y"]
for position in position_data:
    
    # lambda is used to perform a simple calculation on each element of a column, without defining a specific function for it
    # Data normalisation
    
    kill2_df[position] = kill2_df[position].apply(lambda x: x*4000/800000)
    
    # Remove any rows where the position value is zero.
    
    kill2_df = kill2_df[kill2_df[position]!=0]
    
# Display the updated dataframe

kill2_df
Out[102]:
killed_by killer_name killer_placement killer_position_x killer_position_y map match_id time victim_name victim_placement victim_position_x victim_position_y
0 Down and Out Malcolm_x 9.0 2484.9490 1562.8485 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 17.250000 Player 180 22.0 2486.9270 1657.6410
1 M16A4 Malcolm_x 9.0 2484.9490 1562.8485 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 17.250000 Player 181 22.0 2489.0970 1659.9065
2 AKM G_Berg 7.0 2302.0835 2073.7440 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 23.700000 Player 182 16.0 2299.0895 2072.1315
3 AKM Lukesnake17 20.0 2440.1705 1736.1015 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 20.166667 Player 183 9.0 2437.2210 1738.2550
4 SKS AlooGobi 2.0 2505.3145 2125.3930 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 30.300000 Player 184 3.0 2465.2170 2172.2905
... ... ... ... ... ... ... ... ... ... ... ... ...
8229 SKS dragonfruitbamf 2.0 2681.0785 1825.2575 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 27.366667 Player 9996 6.0 2639.0555 1878.3980
8230 M416 lddoos 3.0 2753.5020 1821.0590 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 27.200000 Player 9997 6.0 2612.3790 1878.6390
8231 Down and Out Nan_P 3.0 2538.4565 1944.7300 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 26.800000 Player 9998 6.0 2567.4390 1887.4220
8232 Grenade brentech 7.0 2516.1760 1759.8030 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 25.866667 Player 9999 9.0 2538.4110 1771.7840
8233 SCAR-L SwwH 13.0 2237.5410 3128.1630 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 8.683333 Player 10000 20.0 2236.9205 3129.6000

7811 rows × 12 columns

In [103]:
# Data Reduction (Feature selection - selecting a subset of the original features that are most relevant to the analysis):
# Use only rows with "time" less than 5

# Extracting the rows for the shortest survival time (5 minutes)

kill_sample = kill2_df[kill2_df["time"]<5]

# Display the new dataframe

kill_sample
Out[103]:
killed_by killer_name killer_placement killer_position_x killer_position_y map match_id time victim_name victim_placement victim_position_x victim_position_y
6 Down and Out Snowzcone 44.0 2330.8655 3211.4760 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 1.533333 Player 186 49.0 2331.0745 3205.2575
9 S1897 MaelstromPhoenix 14.0 2232.7690 3107.5790 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 1.750000 Player 189 46.0 2233.4915 3105.1705
12 P92 Homebrw 9.0 1773.6730 1985.1985 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 4.650000 Player 192 38.0 1768.3455 1987.7230
13 Micro UZI NoMersee 26.0 1689.2995 803.0060 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 2.766667 Player 194 42.0 1696.7280 801.3480
17 M416 biubiu_RNG 43.0 2255.2645 3148.2005 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 2.633333 Player 198 37.0 2253.9605 3147.8755
... ... ... ... ... ... ... ... ... ... ... ... ...
8213 Down and Out GreatPandaKing 24.0 3574.5335 1651.7530 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 3.933333 Player 9980 26.0 3556.4780 1639.7185
8215 P1911 Juarezneverfalls 26.0 3558.7235 1635.6795 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 3.683333 Player 9982 24.0 3560.2685 1636.8060
8217 Tommy Gun Go_getter 24.0 3561.1820 1624.1475 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 2.316667 Player 9984 26.0 3556.2920 1634.1975
8219 UMP9 Hking_909 24.0 3562.6530 1636.0950 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 4.033333 Player 9986 26.0 3559.8120 1641.4650
8223 Tommy Gun Go_getter 24.0 3564.9865 1642.8575 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 3.450000 Player 9990 26.0 3569.9880 1647.2005

2488 rows × 12 columns

In [104]:
# Import miramar image

image = Image.open(r"C:\Users\Justin\Downloads\ERANGEL.jpg")

# Adjust the brightness if map

enhancer = ImageEnhance.Brightness(image)

# Increase this value to make it lighter

brightness_factor = 1.1  

# apply the adjusted brightness to the image

brightened_image = enhancer.enhance(brightness_factor)

# Prepare the figure and subplot for further customization and plotting of data
# (1, 1) indicate that we want a single subplot in our figure
# figsize=(15, 15) sets the size of the figure to 15 inches by 15 inches

fig, ax = plt.subplots(1, 1, figsize=(15, 15)) 

# plot the KDE plot with both 

plot = sns.kdeplot(data=kill_sample, x="victim_position_x",y="victim_position_y",n_levels=100, cmap=cm.Reds, alpha=0.9,ax=ax)

# label the axis and title name 

plt.xlabel('x-coordinate', fontsize=18)
plt.ylabel('y-coordinate', fontsize=18)
plt.title('Most dangerous locations', fontsize=20)

# Get the limits of the axis and assign into the variables (min, max)

x_min, x_max = ax.get_xlim()
y_min, y_max = ax.get_ylim()

# aspect='auto',is used to maintain the original proportions of the image
# Display the image with plottings to the axis limits

ax.imshow(brightened_image, extent=[x_min, x_max, y_min, y_max], aspect='auto')
plt.show()

3. What weapon should we use?¶

List of Top 15 gun weapons used in PUBG

In [105]:
# Define the list of gun categories to keep

guns = ['AKM','M16A4','SCAR-L','M416','Groza','M762',
        'Kar98K','M24','AWM',
        'SKS','VSS','Mini 14','Mk14','SLR',
        'Micro UZI','UMP9','Vector','Tommy Gun',
        'S686','S1897','S12K',
        'M249','DP-28',
        'P92','P1911','R1895','P18C']

# Keep only the rows with the specified gun categories

guns_df = kill_df[kill_df['killed_by'].isin(guns)]

# Show the filtered DataFrame

guns_df
Out[105]:
killed_by killer_name killer_placement killer_position_x killer_position_y map match_id time victim_name victim_placement victim_position_x victim_position_y
1 M16A4 Malcolm_x 9.0 496989.8 312569.7 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 17.250000 Player 181 22.0 497819.4 331981.3
2 AKM G_Berg 7.0 460416.7 414748.8 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 23.700000 Player 182 16.0 459817.9 414426.3
3 AKM Lukesnake17 20.0 488034.1 347220.3 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 20.166667 Player 183 9.0 487444.2 347651.0
4 SKS AlooGobi 2.0 501062.9 425078.6 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 30.300000 Player 184 3.0 493043.4 434458.1
5 M416 Powfa 3.0 495501.2 429826.2 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 29.433333 Player 185 6.0 495755.7 438322.0
... ... ... ... ... ... ... ... ... ... ... ... ...
8226 AKM Addictted 2.0 535211.4 364872.2 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 29.000000 Player 9993 3.0 535258.8 365134.4
8228 SCAR-L SurpriseMtheFker 13.0 435366.4 625140.6 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 7.250000 Player 9995 20.0 447501.6 628886.9
8229 SKS dragonfruitbamf 2.0 536215.7 365051.5 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 27.366667 Player 9996 6.0 527811.1 375679.6
8230 M416 lddoos 3.0 550700.4 364211.8 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 27.200000 Player 9997 6.0 522475.8 375727.8
8233 SCAR-L SwwH 13.0 447508.2 625632.6 ERANGEL 2U4GBNA0YmnP7JxGpV6xH481nTM-AN2Ig_D_m3DRTNjINB... 8.683333 Player 10000 20.0 447384.1 625920.0

5370 rows × 12 columns

In [106]:
# make the graphs to have gradient colors

colors = plt.cm.Oranges(np.linspace(0.5,2))

# Plot a horizontal bar chart that shows the top 15 weapons by kill count

# value_counts() - count the unique values in the 'killed_by' column - returns a Series
# select the top 15 values and sort the values in series in ascending order

guns_df['killed_by'].value_counts()[:15].sort_values(ascending = True).plot.barh(figsize=(10,7),color = colors, zorder = 2)

# set the title and name both axis
# add grids to the background of graph
# zorder - the layer of the grids or bars; alpha - transparency of grid

plt.title('Top 15 Weapons by Kill Count', fontsize = 20)
plt.xlabel('Kill Count', fontsize = 15)
plt.ylabel('Weapon', fontsize = 15)
plt.grid(zorder = 1, alpha = 0.4)
In [107]:
# count the unique values in the 'killed_by' column
# then thr top 15 from 'counts' will be selected
# top_15 will be sorted in descending order

counts = guns_df['killed_by'].value_counts()
top_15 = counts[:15]
sorted_top_15 = top_15.sort_values(ascending=False)

# new dataframe with two column, using the sorted_top_15 Series

top15_df = pd.DataFrame({'Weapons':sorted_top_15.index,'Counts':sorted_top_15.values})
In [108]:
# Match the weapons to respective gun types

# Categorise gun types using dictionaries
# key - gun type, value - weapon names

gun_categories = {
        'Assault Rifles':['AKM','M16A4','SCAR-L','M416','Groza','M762'],
        'Lever Action Sniper Rifles':['Kar98K','M24','AWM'],
        'Automatic Snifer Rifles':['SKS','VSS','Mini 14','Mk14','SLR'],
        'SubMachine Guns':['Micro UZI','UMP9','Vector','Tommy Gun'],
        'Shotguns':['S686','S1897','S12K'],
        'Light Machine Guns':['M249','DP-28'],
        'Pistols':['P92','P1911','R1895','P18C']
}

# Define a mapping function
# the parameter 'weapon' represents the value in the column "Weapons"
# the function then iterates (performs repeatedly) over the items of the gun_categories dictionary using the .items() method
# 'gun_categories' dict - key: gun types, value: weapon names
# in the loop, if 'weapon' presents in the 'value' list (weapon names) in the dict
# means the 'weapon' belongs to the gun type, the corresponding 'key' is returned
# if no match is found, the function returns 'None'

def map_gun_type(weapon):
    for key, values in gun_categories.items():
        if weapon in values:
            return key
    return None

# Map values using the mapping function

top15_df["Gun type"] = top15_df["Weapons"].map(map_gun_type)

# Use the same way to match the weapons to respective ammo types

# Categorise gun types using dictionaries
# key - ammo type, value - weapon names

ammo_categories = {'.300 Magnum Ammo':'AWM',
            '.45 ACP Ammo':['P1911','Tommy Gun','UMP9'],
            '12 Gauge Ammo':['S1897','S686','S12K'],
            '5.56mm Ammo':['M16A4','M249','M416','SCAR-L','Mini 14'],
            '7.62mm Ammo':['AKM','DP-28','Groza','M762','Kar98K','M24','Mk14','R1895','SKS','SLR'],
            '9mm Ammo':['Micro UZI','P92','VSS','P18C','Vector']
}

# Define a mapping function
# the parameter 'weapon' represents the value in the column "Weapons"
# the function then iterates (performs repeatedly) over the items of the ammo_cat dictionary using the .items() method
# 'ammo_cat' dict - key: ammo types, value: weapon names
# in the loop, if 'weapon' presents in the 'value' list (weapon names) in the dict
# means the 'weapon' belongs to the gun type, the corresponding 'key' is returned
# if no match is found, the function returns 'None'

def map_ammo_type(weapon):
    for key, values in ammo_categories.items():
        if weapon in values:
            return key
    return None

# Map values using the mapping function

top15_df["Ammo type"] = top15_df["Weapons"].map(map_ammo_type)

# let the index starts from 1 (instead of 0)

top15_df.index = range(1, len(top15_df) + 1)

# Show the final list of Top 15 Weapons, followed by Gun and Ammo type

top15_df
Out[108]:
Weapons Counts Gun type Ammo type
1 M416 845 Assault Rifles 5.56mm Ammo
2 SCAR-L 755 Assault Rifles 5.56mm Ammo
3 AKM 689 Assault Rifles 7.62mm Ammo
4 M16A4 678 Assault Rifles 5.56mm Ammo
5 UMP9 459 SubMachine Guns .45 ACP Ammo
6 S1897 361 Shotguns 12 Gauge Ammo
7 Mini 14 270 Automatic Snifer Rifles 5.56mm Ammo
8 SKS 214 Automatic Snifer Rifles 7.62mm Ammo
9 S686 212 Shotguns 12 Gauge Ammo
10 S12K 201 Shotguns 12 Gauge Ammo
11 Micro UZI 166 SubMachine Guns 9mm Ammo
12 P1911 111 Pistols .45 ACP Ammo
13 Tommy Gun 98 SubMachine Guns .45 ACP Ammo
14 P92 96 Pistols 9mm Ammo
15 Vector 45 SubMachine Guns 9mm Ammo

4. Should we assist our teammates or play alone?¶

Relationship between Number of Assists & Probability to Win

In [109]:
# make the graphs to have gradient colors

colors = plt.cm.Oranges(np.linspace(0.3,3))

# create new column 'winner', values in the column are true if 'team_placement' equals to 1
# takes only rows with the top1 player data

match_df['winner'] = match_df['team_placement'] == 1

# new dataframe is created by selecting specific columns 'player_assists' and 'winner'
# only the rows with 'party size' not equal to 1 are included in the new dataframe
# we dont want rows with 'party size' equal to 1

assists_df = match_df.loc[match_df['party_size']!=1, ['player_assists','winner']]

# Plot a bar chart that shows the relationship between number of assists and probability to win

# all rows in df that have the same unique values in the 'player_assists' column are grouped together
# calculates the mean value of the 'winner' column for each group
# the mean will represent the proportion of wins for each 'number of assists' group
# plot.bar() creates a bar chart of the mean values calculated for each category
# rot & figsize is optional
# color = colors - sets the bars to have gradient colors
# zorder - sets the order of the bar chart displayed in the graph

assists_df.groupby('player_assists').winner.mean().plot.bar(rot = 0, figsize = (10,5), color = colors, zorder = 2)

# set titles and name both axis
# add grids to the background of graph
# zorder - the layer of the grids or bars; alpha - transparency of grid

plt.title("Relationship between Number of Assists & Probability to Win", fontsize = 18)
plt.xlabel("Number of assists by player", fontsize = 14)
plt.ylabel("Probability to Win", fontsize = 14)
plt.grid(axis = 'y', zorder = 1)

5. Should we drive or walk?¶

1. Relationship between Driving & Probability to Win

In [110]:
# create new column 'winner', values in the column are true if 'team_placement' equals to 1
# takes only rows with the top1 player data

match_df['winner'] = match_df['team_placement'] == 1

# create new column 'winner', values in the column are true if driving distance doesnt equal to 0
# takes only rows with the driving distance

match_df['drove'] = match_df['player_dist_ride'] != 0

# Plot a horizontal bar chart that shows the relationship between driving and probability to win

# groups the values in the 'drove' column and create two groups - for True values and False values
# can calculate the proportion of 'True' values in each group
# plot.barh() creates a horizontal bar chart of the mean values calculated for each category
# figsize = (width, height) sets the size of the figure
# without figsize = (w,h) also can, will show figure by default size

match_df.groupby('drove').winner.mean().plot.barh(figsize = (10,5), color = "Orange", zorder = 2)

# set titles, name both axis and set labels for y-axis
# add grids to the background of graph
# zorder - the layer of the grids or bars; alpha - transparency of grid

plt.title("Relationship between Driving & Probability to Win", fontsize = 18)
plt.xlabel("Probability to Win", fontsize = 14)
plt.ylabel("Whether players drive or not", fontsize = 14)
plt.yticks([1,0],["Drive","Don't drive"])
plt.grid(zorder = 1, alpha = 0.5)

2. Relationship between Driving Distance & Probability to Win

In [111]:
# make the graphs to have gradient colors

colors = plt.cm.Oranges(np.linspace(0.4,2))

# !!
# df.loc[df[condition,['column1','column2']
# select rows from the dataframe we imported (df_match)
# rows with the values < 10000 under the column 'player_dist_ride'
# only include these columns in new dataframe (distance_df): 'player_dist_ride','player_survive_time'

distance_df = match_df.loc[match_df['player_dist_ride'] < 10000, ['player_dist_ride','winner']]

# create labels to categorise the distance (10 groups) 

label_distance = ["0-1k","1k-2k","2k-3k","3k-4k","4k-5k","5k-6k","6k-7k","7k-8k","8k-9k","9k-10k"]

# pd.cut(df['column'],number of intervals,labels = labelname)
# pd.cut() divides data into a specified number of categories or intervals
# categorise the distance values and match to the labels

distance_df['drive_distance'] = pd.cut(distance_df['player_dist_ride'], 10, labels = label_distance)

# Plot a bar chart that shows the relationship between driving distance and probability to win

# 1. df.groupby('column') groups the data in df based on the unique values in column
# 2. all rows in df that have the same value in the column are grouped together
# 3. by applying groupby(), distinct groups based on drive_distance categories are created
# 4. so can calculate the mean value of the 'winner' column for each group,
#    the mean will represent the proportion of wins for each 'drive_distance' group
# 5. plot.bar() creates a bar chart of the mean values calculated for each category
# 6. rot & figsize is optional
# 7. 'rot = 0' rotates the label by 0 degrees = no rotation = horizontally (easier to read)
# 8. if 'rot = 45' rotates the label by 45 degrees (clockwise)
# 9. if no put 'rot = 0', the label would be in 90 degrees (or by default angle), vertically shown
# 10. figsize = (width, height) sets the size of the figure
# 11. color = colors - sets the bars to have gradient colors
# 12. zorder - sets the order of the bar chart displayed in the graph

distance_df.groupby('drive_distance').winner.mean().plot.bar(rot = 0, figsize = (13,5), color = colors, zorder = 2)

# set titles, name both axis and set labels for y-axis
# add grids to the background of graph
# zorder - the layer of the grids or bars; alpha - transparency of grid

plt.title("Relationship between Driving Distance & Probability to Win", fontsize = 18)
plt.xlabel("Driving distance", fontsize = 14)
plt.ylabel("Probability to win", fontsize = 14)
plt.grid(zorder = 1, alpha = 0.6)

6. Where will the Bluezone shrink to?¶

Visualisation of the final bluezone locations

In [112]:
kill_df.shape
Out[112]:
(8234, 12)
In [113]:
match_df.shape
Out[113]:
(10000, 17)
In [114]:
# Select only the rows where the "team_placement" column has a value of 1

team_win = match_df [match_df ["team_placement"]==1]


# Find the last man standing from the 1st team
# Group the dataframe "team_win" by the "match_id" column and applies a lambda function to each group.
# The lambda function is used to filter each group and select the row with the max value in "player_survive_time" column

grouped = team_win.groupby('match_id').apply(lambda t: t[t.player_survive_time==t.player_survive_time.max()])


# Select only the rows of "kill_df" where value of "match_id" column is same with the "match_id" values of "grouped" 

deaths_solo = kill_df[kill_df['match_id'].isin (grouped ['match_id'].values)]


# Select only the rows where the "victim_placement" column has a value of 2

df_second = deaths_solo[(deaths_solo['victim_placement']==2)].dropna()

# display the new dataframe

df_second
Out[114]:
killed_by killer_name killer_placement killer_position_x killer_position_y map match_id time victim_name victim_placement victim_position_x victim_position_y
37 M416 DevilBlood35 1.0 504913.5 441667.2 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 31.416667 Player 219 2.0 503132.6 433288.4
102 SCAR-L Illmy0111 1.0 180234.1 512721.8 ERANGEL 2U4GBNA0YmlxiifTmjHuCGJhLNLL-lhlH9TQh47o-9IZVJ... 30.900000 Player 285 2.0 181216.5 513549.2
175 Down and Out Illmy0111 1.0 174635.6 510872.2 ERANGEL 2U4GBNA0YmlxiifTmjHuCGJhLNLL-lhlH9TQh47o-9IZVJ... 27.333333 Player 360 2.0 166706.2 538946.3
206 AKM VSUPciwei 1.0 396662.7 307742.9 ERANGEL 2U4GBNA0YmmLlZbT02zMNt2JlWQ2eYFKRG02TGemIK1RM2... 31.066667 Player 392 2.0 396531.8 304991.7
248 Down and Out VSUPciwei 1.0 396289.3 308352.7 ERANGEL 2U4GBNA0YmmLlZbT02zMNt2JlWQ2eYFKRG02TGemIK1RM2... 31.066667 Player 434 2.0 394014.9 303887.9
... ... ... ... ... ... ... ... ... ... ... ... ...
7827 M416 CNM_10086 1.0 321362.7 151641.6 ERANGEL 2U4GBNA0YmmkSI66uqmO1tdpb1le9dHphK-SEiVW9bCyMM... 31.416667 Player 9406 2.0 320446.6 145273.4
7998 Grenade gouxiongwang 1.0 501849.9 296173.3 ERANGEL 2U4GBNA0YmlAIXAxa8UNSqKMPaMblcwdWLI6-w44_RBSbY... 32.666667 Player 9582 2.0 502382.6 297894.0
8020 Kar98k R_eborn 3.0 505507.1 293815.9 ERANGEL 2U4GBNA0YmlAIXAxa8UNSqKMPaMblcwdWLI6-w44_RBSbY... 30.083333 Player 9604 2.0 492292.8 302644.7
8029 Kar98k R_eborn 3.0 505503.1 293733.7 ERANGEL 2U4GBNA0YmlAIXAxa8UNSqKMPaMblcwdWLI6-w44_RBSbY... 29.650000 Player 9613 2.0 495596.8 306128.8
8030 Down and Out R_eborn 3.0 505494.6 293797.7 ERANGEL 2U4GBNA0YmlAIXAxa8UNSqKMPaMblcwdWLI6-w44_RBSbY... 31.166667 Player 9614 2.0 494745.7 304383.4

202 rows × 12 columns

In [115]:
df_second.shape
Out[115]:
(202, 12)
In [116]:
# Data normalisation: Rescale the data in the "position" columns from their original values to a different scale.
# This is to bring the values into a more smaller range for further analysis or visualization.

position_data = ["victim_position_x", "victim_position_y","killer_position_x", "killer_position_y"]
for position in position_data:
 
    # Data normalisation
    # lambda is used to perform a simple calculation on each element of a column, without defining a specific function for it
    
    df_second[position] = df_second[position].apply(lambda x: x*4000/800000)
    
    # Remove any rows where the position value is zero.
    
    df_second = df_second[df_second[position]!=0]

# display the updated dataframe    

df_second
Out[116]:
killed_by killer_name killer_placement killer_position_x killer_position_y map match_id time victim_name victim_placement victim_position_x victim_position_y
37 M416 DevilBlood35 1.0 2524.5675 2208.3360 ERANGEL 2U4GBNA0YmnRe95wOy7kuweIkuZo5Roa0WjNZsgboi2gzz... 31.416667 Player 219 2.0 2515.6630 2166.4420
102 SCAR-L Illmy0111 1.0 901.1705 2563.6090 ERANGEL 2U4GBNA0YmlxiifTmjHuCGJhLNLL-lhlH9TQh47o-9IZVJ... 30.900000 Player 285 2.0 906.0825 2567.7460
175 Down and Out Illmy0111 1.0 873.1780 2554.3610 ERANGEL 2U4GBNA0YmlxiifTmjHuCGJhLNLL-lhlH9TQh47o-9IZVJ... 27.333333 Player 360 2.0 833.5310 2694.7315
206 AKM VSUPciwei 1.0 1983.3135 1538.7145 ERANGEL 2U4GBNA0YmmLlZbT02zMNt2JlWQ2eYFKRG02TGemIK1RM2... 31.066667 Player 392 2.0 1982.6590 1524.9585
248 Down and Out VSUPciwei 1.0 1981.4465 1541.7635 ERANGEL 2U4GBNA0YmmLlZbT02zMNt2JlWQ2eYFKRG02TGemIK1RM2... 31.066667 Player 434 2.0 1970.0745 1519.4395
... ... ... ... ... ... ... ... ... ... ... ... ...
7827 M416 CNM_10086 1.0 1606.8135 758.2080 ERANGEL 2U4GBNA0YmmkSI66uqmO1tdpb1le9dHphK-SEiVW9bCyMM... 31.416667 Player 9406 2.0 1602.2330 726.3670
7998 Grenade gouxiongwang 1.0 2509.2495 1480.8665 ERANGEL 2U4GBNA0YmlAIXAxa8UNSqKMPaMblcwdWLI6-w44_RBSbY... 32.666667 Player 9582 2.0 2511.9130 1489.4700
8020 Kar98k R_eborn 3.0 2527.5355 1469.0795 ERANGEL 2U4GBNA0YmlAIXAxa8UNSqKMPaMblcwdWLI6-w44_RBSbY... 30.083333 Player 9604 2.0 2461.4640 1513.2235
8029 Kar98k R_eborn 3.0 2527.5155 1468.6685 ERANGEL 2U4GBNA0YmlAIXAxa8UNSqKMPaMblcwdWLI6-w44_RBSbY... 29.650000 Player 9613 2.0 2477.9840 1530.6440
8030 Down and Out R_eborn 3.0 2527.4730 1468.9885 ERANGEL 2U4GBNA0YmlAIXAxa8UNSqKMPaMblcwdWLI6-w44_RBSbY... 31.166667 Player 9614 2.0 2473.7285 1521.9170

198 rows × 12 columns

In [117]:
# Set the plotting context to 'talk'
# The resulting plot will have larger fonts, thicker lines, and other adjustments optimized for a talk or presentation setting.
# This helps ensure that the visual elements of the plot are clear and easily visible to the audience.

sns.set_context('talk')

# Import the image of erangel map

image = Image.open(r"C:\Users\Justin\Downloads\ERANGEL.jpg")

# Adjust the brightness if map

enhancer = ImageEnhance.Brightness(image)

# Increase this value to make it lighter

brightness_factor = 1.3  

# apply the adjusted brightness to the image

brightened_image = enhancer.enhance(brightness_factor)
In [118]:
# Prepare the figure and subplot for further customization and plotting of data
# (1, 1) indicate that we want a single subplot in our figure
# figsize=(15, 15) sets the size of the figure to 15 inches by 15 inches

fig, ax = plt.subplots(1, 1, figsize=(15, 15)) 

# Create a KDE plot for visualisation

# 1. Use df_second as data source for plotting
# 2. Specify the x and y axis
# 3. cmap is used to set the blue colormap for the plot
# 4. An alpha value of 0.7 means that the plot will be slightly transparent
# 5. shade=True means the area under the curve is filled with color
# 6. ax=ax means to add to the subplot created earlier

plot = sns.kdeplot(data=df_second, x="victim_position_x", y="victim_position_y", cmap="Blues", alpha=0.7, shade=True, ax=ax)

# Reduce the contrast by adjusting alpha
# Adjust the alpha value to reduce transparency

plot.collections[8].set_alpha(0.8) 
plot.collections[0].set_alpha(0.3) 
plot.collections[1].set_alpha(0.5)

# Adjust the RGB values to reduce saturation
# (0.5, 0.5, 1) is a tuple that corresponds to a light blue color in the RGB value.

plot.collections[0].set_facecolor((0.5, 0.5, 1))  


# Get the limits of the axis and assign into the variables (min, max)

x_min, x_max = ax.get_xlim()
y_min, y_max = ax.get_ylim()

# Display image of map with the KDE plot
# Image will be displayed within the same x-axis and y-axis limits using the extend parameter
# For example, display from the minimum x coordinate and extened to the maximum x coordinate
# Aspect = 'auto' is used to match the ratio of the image and the subplot

ax.imshow(brightened_image, extent=[x_min, x_max, y_min, y_max], aspect='auto')

# Sett the title, and the x,y lables

plt.xlabel('x-coordinate', fontsize=18)
plt.ylabel('y-coordinate', fontsize=18)
plt.title('Final Bluezone locations', fontsize=20)

#Display the results

plt.show()


# In the KDE plot, higher density regions are shown by more intense colors
# and lower density regions indicated by lighter colors
C:\Users\Justin\AppData\Local\Temp\ipykernel_11832\3161281345.py:16: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  plot = sns.kdeplot(data=df_second, x="victim_position_x", y="victim_position_y", cmap="Blues", alpha=0.7, shade=True, ax=ax)
In [ ]: